Built with R 4.0.3
Last saved on 05 February 2022 at 15:41
knitr::opts_chunk$set(echo = TRUE, tidy = TRUE, paged.print=TRUE, fig.width = 10, warning=FALSE, cache = TRUE)
library(car) # For data wrangling
library(caret) # For its confusion matrix function
#library(clipr) # For quick exports to other programme
library(cowplot)
library(DescTools) # For 95% CI
library(dplyr)
library(emmeans)
#library(FactoMineR) # For Shiny app
library(factoextra) # For circular graphs of variables
library(forcats) # For the fct_relevel function
library(here) # For dynamic file paths
library(ggplot2)
library(ggthemes) # For theme of factoextra plots
library(lme4) # For linear regression modelling
library(patchwork) # To create figures with more than one plot
#library(pca3d) # For 3-D plots (not rendered in html knit)
library(PCAtools) # For nice biplots of PCA results
library(purrr) # For data wrangling
library(psych) # For various useful stats function
library(sjPlot) # For model plots and tables
library(suffrager) # For pretty feminist colour palettes :)
library(visreg) # For plots of interaction effects
source(here("R_rainclouds.R")) # For geom_flat_violin rainplots
Import full data (see 7_Ref_data_prep.Rmd for data preparation steps)
data <- readRDS(here("FullMDA", "dataforPCA.rds"))
This chunk can be used to perform the MDA on various subsets of the data.
# Full dataset
data <- readRDS(here("FullMDA", "dataforPCA.rds"))
# Subset of the data that excludes levels A and B textbooks
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
filter(Level !="A" & Level != "B") %>%
droplevels()
summary(data$Level)
# Subset of the data to only include one Country subcorpus of the TEC
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
#filter(Country != "France" & Country != "Germany") %>% # Spain only
#filter(Country != "France" & Country != "Spain") %>% # Germany only
filter(Country != "Spain" & Country != "Germany") %>% # France only
droplevels()
summary(data$Country)
# Perform PCA on random subset of the data to test the stability of the solution. Re-running this line will generate a new subset of 2/3 of the texts randomly sampled.
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
slice_sample(n = 4980*0.6, replace = FALSE)
nrow(data)
This chunk is used to create the 3-D plots. These cannot be rendered in the html knit.
colnames(data)
pca <- prcomp(data[,9:ncol(data)], scale.=FALSE) # All quantitative variables
register <- factor(data[,"Register"])
corpus <- factor(data[,"Corpus"])
subcorpus <- factor(data[,"Subcorpus"])
summary(register)
summary(corpus)
summary(subcorpus)
summary(pca)
# 3-D plot
colours <- suf_palette(name = "london", n = 6, type = "continuous")
colours2 <- suf_palette(name = "classic", n = 5, type = "continuous")
colours <- c(colours, colours2[c(2:4)]) # Nine colours range
col6 <- colours[c(6,5,4,7,9,2)] # Good order for PCA
scales::show_col(col6)
col6 <- c("#F9B921", "#A18A33", "#722672", "#BD241E", "#267226", "#15274D")
names(col6) <- c("Textbook Conversation", "Textbook Fiction", "Textbook Informative", "Spoken BNC2014 Ref.", "Youth Fiction Ref.", "Info Teens Ref.")
shapes6 <- c(rep("cube", 3),rep("sphere", 3))
names(shapes6) <- c("Textbook Conversation", "Textbook Fiction", "Textbook Informative", "Spoken BNC2014 Ref.", "Youth Fiction Ref.", "Info Teens Ref.")
pca3d(pca, group = subcorpus,
components = 1:3,
#components = 4:6,
show.plane=FALSE,
col = col6,
shape = shapes6,
radius = 0.7,
legend = "right")
## Looking at all three Textbook English registers in one colour
col4 <- colours[c(1,3,7,9)]
col4 <- c("#EA7E1E", "#15274D", "#BD241E", "#267226")
names(col4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")
shapes4 <- c("cube", rep("sphere", 3))
names(shapes4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")
pca3d(pca, group = corpus,
show.plane=FALSE,
components = 1:3,
col = col4,
shape = shapes4,
radius = 0.7,
legend = "right")
data2 <- data %>%
mutate(Source = recode_factor(Corpus, Textbook.English = "Textbook English (TEC)", Informative.Teens = "Reference corpora", Spoken.BNC2014 = "Reference corpora", Youth.Fiction = "Reference corpora")) %>%
mutate(Corpus = fct_relevel(Subcorpus, "Info Teens Ref.", after = 9)) %>%
relocate(Source, .after = "Corpus") %>%
droplevels(.)
colnames(data2)
## [1] "Filename" "Register" "Level" "Series" "Country" "Corpus"
## [7] "Source" "Subcorpus" "Words" "ACT" "AMP" "ASPECT"
## [13] "AWL" "BEMA" "CAUSE" "CC" "COMM" "CONC"
## [19] "COND" "CONT" "CUZ" "DEMO" "DMA" "DOAUX"
## [25] "DT" "DWNT" "ELAB" "EMPH" "EX" "EXIST"
## [31] "FPP1P" "FPP1S" "FPUH" "GTO" "HDG" "HGOT"
## [37] "IN" "JJAT" "JJPR" "LD" "MDCA" "MDCO"
## [43] "MDMM" "MDNE" "MDWO" "MDWS" "MENTAL" "NCOMP"
## [49] "NN" "OCCUR" "PASS" "PEAS" "PIT" "PLACE"
## [55] "POLITE" "POS" "PROG" "QUAN" "QUPR" "QUTAG"
## [61] "RB" "RP" "SPLIT" "SPP2" "STPR" "THATD"
## [67] "THRC" "THSC" "TTR" "VBD" "VBG" "VBN"
## [73] "VIMP" "WHQU" "WHSC" "XX0" "YNQU" "TPP3"
## [79] "FQTI"
data2meta <- data2[,1:9]
rownames(data2meta) <- data2meta$Filename
data2meta <- data2meta %>% select(-Filename)
head(data2meta)
## Register Level Series Country
## Achievers_B1_plus_Informative_0007.txt Informative D Achievers Spain
## POC_5e_Spoken_0003.txt Conversation B POC France
## Access_4_Narrative_0013.txt Fiction D Access Germany
## NGL_1_Spoken_0002.txt Conversation A NGL Germany
## Access_1_Narrative_0005.txt Fiction A Access Germany
## NGL_2_Narrative_0007.txt Fiction B NGL Germany
## Corpus
## Achievers_B1_plus_Informative_0007.txt Textbook Informative
## POC_5e_Spoken_0003.txt Textbook Conversation
## Access_4_Narrative_0013.txt Textbook Fiction
## NGL_1_Spoken_0002.txt Textbook Conversation
## Access_1_Narrative_0005.txt Textbook Fiction
## NGL_2_Narrative_0007.txt Textbook Fiction
## Source
## Achievers_B1_plus_Informative_0007.txt Textbook English (TEC)
## POC_5e_Spoken_0003.txt Textbook English (TEC)
## Access_4_Narrative_0013.txt Textbook English (TEC)
## NGL_1_Spoken_0002.txt Textbook English (TEC)
## Access_1_Narrative_0005.txt Textbook English (TEC)
## NGL_2_Narrative_0007.txt Textbook English (TEC)
## Subcorpus Words
## Achievers_B1_plus_Informative_0007.txt Textbook Informative 690
## POC_5e_Spoken_0003.txt Textbook Conversation 694
## Access_4_Narrative_0013.txt Textbook Fiction 547
## NGL_1_Spoken_0002.txt Textbook Conversation 927
## Access_1_Narrative_0005.txt Textbook Fiction 840
## NGL_2_Narrative_0007.txt Textbook Fiction 1127
rownames(data2) <- data2$Filename
data2num <- as.data.frame(base::t(data2[,10:ncol(data2)]))
data2num[1:5,1:5] # Check data frame format is correct
## Achievers_B1_plus_Informative_0007.txt POC_5e_Spoken_0003.txt
## ACT 1.3305834 -0.8133966
## AMP -0.4504541 0.1376718
## ASPECT 1.0228557 -0.4515438
## AWL 0.7288132 -0.7268089
## BEMA -0.4801820 1.1272237
## Access_4_Narrative_0013.txt NGL_1_Spoken_0002.txt
## ACT -0.4707243 -1.014328
## AMP 1.4534008 -0.561931
## ASPECT 1.1039502 -0.749580
## AWL -0.5477092 -0.716272
## BEMA 0.2456594 1.700641
## Access_1_Narrative_0005.txt
## ACT 0.03320131
## AMP -0.52977306
## ASPECT -0.55344739
## AWL -0.56332789
## BEMA -0.39233698
p <- PCAtools::pca(data2num,
metadata = data2meta,
scale = FALSE)
p$variance[1:6]
## PC1 PC2 PC3 PC4 PC5 PC6
## 30.334148 7.512979 5.886828 3.415906 2.703581 2.415911
sum(p$variance[1:6])
## [1] 52.26935
# For five TEC registers
# colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672", `Textbook Instructional` = "grey", `Textbook Personal` = "black")
# For three TEC registers
summary(data2$Corpus)
## Textbook Conversation Textbook Fiction Textbook Informative
## 565 285 352
## Spoken BNC2014 Ref. Youth Fiction Ref. Info Teens Ref.
## 1250 1191 1337
colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672")
summary(data2$Source)
## Textbook English (TEC) Reference corpora
## 1202 3778
shapekey = c(`Textbook English (TEC)`=6, `Reference corpora`=1)
## Very slow, open in zoomed out window!
# Add legend manually? Yes (take it from the biplot code below), let's not waste too much time, here. Or use Evert's mvar.pairs plot (though that also requires manual axis annotation!)
PCAtools::pairsplot(p,
triangle = FALSE,
components = 1:6,
pointSize = 0.4,
shape = "Source",
shapekey = shapekey,
lab = NULL, # Otherwise will try to label each data point!
colby = "Corpus",
colkey = colkey)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.